library(tantale)
library(ggplot2)
library(tidyverse)
library(Biostrings)
library(parallel)
library(magrittr)
library(DT)
library(gplots)
library(ape)
library(ggtree)
This vignette will go through all steps in a talomes analysis
pipeline from Xoo genome sequences using functions in package
tantale.
Genomes used for this vignette are MAI1, BAI3, and BAI3-1-1 (an
error-prone genome with TalC deleted).
Get the fasta files:
mai1_fa <- system.file("extdata", "MAI1.fa", package = "tantale", mustWork = T)
bai3_fa <- system.file("extdata", "BAI3.fa", package = "tantale", mustWork = T)
bai311_fa <- system.file("extdata", "BAI3-1-1.fa", package = "tantale", mustWork = T)
pxo86 <- system.file("extdata", "PXO86.fa", package = "tantale", mustWork = T)
Output dir to save all the results of this vignette:
outdir <- fs::dir_create("~/TEMP/test_tantale") # tempdir(check = TRUE)
telltale2_outdir <- file.path(outdir, "telltale2")
dir.create(telltale2_outdir, showWarnings = T)
We will run telltale2() without correction for “gold”
genomes, MAI1 and BAI3, and telltale2() with correction for
“error-prone” genome BAI3-1-1.
By default, telltale2() adds NTERM and CTERM to RVD
sequences as markers of the termini, that is important to map the repeat
codes by DisTal with the RVDs.
mclapply(c(mai1_fa, bai3_fa, pxo86), function(fastaFile) {
outdirname <- gsub(".fa", "_nocorrection", basename(fastaFile))
outputDir <- file.path(telltale2_outdir, outdirname)
tellTale(subjectFile = fastaFile,
outputDir = outputDir,
appendExtremityCodes = TRUE,
rvdSep = "-",
talArrayCorrection = FALSE)
}, mc.cores = 2)
#>> [[1]]
#>> [1] "/home/cunnac/TEMP/test_tantale/telltale2/MAI1_nocorrection"
#>>
#>> [[2]]
#>> [1] "/home/cunnac/TEMP/test_tantale/telltale2/BAI3_nocorrection"
#>>
#>> [[3]]
#>> [1] "/home/cunnac/TEMP/test_tantale/telltale2/PXO86_nocorrection"
tellTale(subjectFile = bai311_fa,
outputDir = file.path(telltale2_outdir, gsub(".fa", "_correction", basename(bai311_fa))),
TALE_CtermDNAHitMinScore = 300,
appendExtremityCodes = TRUE,
rvdSep = "-",
talArrayCorrection = TRUE)
#>> # HMMER 3.3 (Nov 2019); http://hmmer.org/
#>> # Copyright (C) 2019 Howard Hughes Medical Institute.
#>> Finding the closest reference amino acid sequences:
#>> ================================================================================
#>>
#>> Time difference of 0.55 secs
#>> Assessing frameshifts in nucleotide sequences:
#>> ================================================================================
#>>
#>> Time difference of 400.62 secs
#>> ## Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ## java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00001/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00001
#>> ## Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ## java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00002/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00002
#>> ## Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ## java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00003/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00003
#>> ## Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ## java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00005/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00005
#>> ## Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ## java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00006/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00006
#>> ## Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ## java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00007/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00007
#>> ## Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ## java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00008/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00008
#>> ## Now running annoTALE analyze for putativeTalOrf using the following command:
#>> ## java -jar /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/tools/AnnoTALEcli-1.4.1.jar analyze t=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00009/putativeTalOrf.fasta outdir=/home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction/annotale/ROI_00009
#>> #****************************************
#>> #** tellTale analysis done **
#>> Current date: Tue Aug 29 14:16:33 2023
#>> #_________Provided I/O parameters __________
#>> File of subject DNA sequences: /tmp/RtmpaJIkJj/file8c86a17eba4ab
#>> TALE N-term CDS region detection HMM file: /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/extdata/hmmProfile/Xo_TALE_Nterm_CDS_profile.hmm
#>> TALE repeat unit CDS detection HMM file: /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/extdata/hmmProfile/Xo_TALE_repeat_CDS_profile.hmm
#>> TALE C-term CDS region detection HMM file: /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/extdata/hmmProfile/Xo_TALE_Cterm_CDS_profile.hmm
#>> Output directory: /home/cunnac/TEMP/test_tantale/telltale2/BAI3-1-1_correction
#>> #____________Other parameters________________
#>> TALE_NtermDNAHitMinScore : 300
#>> repeatDNAHitMinScore : 20
#>> TALE_CtermDNAHitMinScore : 300
#>> minDomainHitsPerSubjSeq : 4
#>> mergeHits : TRUE
#>> minGapWidth : 35
#>> extendedLength : 300
#>> talArrayCorrection : TRUE
#>> refForTalArrayCorrection : /home/cunnac/R/x86_64-pc-linux-gnu-library/4.2/tantale/extdata/decipher_ref_tales_aa.fa.gz
#>> frameShiftCorrection : -11
#>> #__________Summary measures of TALE search outcome__________
#>> Number of analysed subject sequences : 2
#>> Total number of TALE repeat DNA coding sequence motif hits found with the nhmmer approach: 155
#>> Total number of subject seqs with TALE motif hits after low hit number filtering: 1
#>> Total number of distinct regions (repeat arrays) with adjacent TALE motifs : 9
#>> Total number of 'complete' arrays (with both N- and C-term flanking motifs): 8
#>> Minimum array length (number of TALE domain hits): 1
#>> Maximum array length: 28
#>> Median array length: 20
#>> Number of gaps of size below 500nt between TALE motifs arrays: 1.5
#>> First quartile of size of gaps (below 500nt) between TALE motifs arrays: 108
#>> Median size of gaps (below 500nt) between TALE motifs arrays: 108
#>> Upper quartile of size of gaps (below 500nt) between TALE motifs arrays: 108
#>> #__________Noteworthy AnnoTale issues__________
#>> #
#>> #*************************
Besides RVD sequences and Tal ORFs, other useful outputs of
telltale2() are the reports, from which we can get
information about the structure of Tal sequences, corrections have been
made, and so on.
The first we should take a look is “arrayReport.tsv”.Wwe will take the files from 3 result folders, gather them, and see which information we can display from it.
# color set for ploting
sunset10 <- c("#ba6b57", "#87556f", "#848ccf", "#93b5e1", "#6f4a8e", "#a2738c", "#ab93c9", "#588da8", "#726a95", "#bc658d")
arrayReport_files <- list.files(telltale2_outdir, "arrayReport.tsv", recursive = T, full.names = T)
arrayRp <- tibble()
for (f in arrayReport_files) {
a <- read_tsv(f)
if (!any(c("predicted_ins_count", "predicted_dels_count") %in% colnames(a)))
# 2 columns "predicted_ins_count" and "predicted_dels_count" are not displayed in the report in case of no correction
{a[c("predicted_ins_count", "predicted_dels_count")] <- NA}
a$strain <- gsub("\\_.+", "", basename(dirname(f)))
arrayRp <- rbind(arrayRp, a)
}
#>> Rows: 10 Columns: 15
#>> ── Column specification ────────────────────────────────────────────────────────
#>> Delimiter: "\t"
#>> chr (6): arrayID, OriginalSubjectName, Strand, ArraySeq, SeqOfRVD, LongestOR...
#>> dbl (7): Start, End, NumberOfHits, N.terminusAAlength, C.terminusAAlength, L...
#>> lgl (2): AllDomains, aberrantRepeat
#>>
#>> ℹ Use `spec()` to retrieve the full column specification for this data.
#>> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#>> Rows: 9 Columns: 17
#>> ── Column specification ────────────────────────────────────────────────────────
#>> Delimiter: "\t"
#>> chr (6): arrayID, OriginalSubjectName, Strand, ArraySeq, SeqOfRVD, LongestOR...
#>> dbl (9): Start, End, NumberOfHits, predicted_ins_count, predicted_dels_count...
#>> lgl (2): AllDomains, aberrantRepeat
#>>
#>> ℹ Use `spec()` to retrieve the full column specification for this data.
#>> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#>> Rows: 10 Columns: 15
#>> ── Column specification ────────────────────────────────────────────────────────
#>> Delimiter: "\t"
#>> chr (6): arrayID, OriginalSubjectName, Strand, ArraySeq, SeqOfRVD, LongestOR...
#>> dbl (7): Start, End, NumberOfHits, N.terminusAAlength, C.terminusAAlength, L...
#>> lgl (2): AllDomains, aberrantRepeat
#>>
#>> ℹ Use `spec()` to retrieve the full column specification for this data.
#>> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#>> Rows: 19 Columns: 15
#>> ── Column specification ────────────────────────────────────────────────────────
#>> Delimiter: "\t"
#>> chr (6): arrayID, OriginalSubjectName, Strand, ArraySeq, SeqOfRVD, LongestOR...
#>> dbl (7): Start, End, NumberOfHits, N.terminusAAlength, C.terminusAAlength, L...
#>> lgl (2): AllDomains, aberrantRepeat
#>>
#>> ℹ Use `spec()` to retrieve the full column specification for this data.
#>> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
arrayRp <- as_tibble(arrayRp)
This report takes result of HMMer for Tals motif detection, result of
correction and AnnoTALE analyze.
What information it contains:
colnames(arrayRp)
#>> [1] "arrayID" "OriginalSubjectName" "Start"
#>> [4] "End" "Strand" "NumberOfHits"
#>> [7] "ArraySeq" "AllDomains" "SeqOfRVD"
#>> [10] "aberrantRepeat" "N.terminusAAlength" "C.terminusAAlength"
#>> [13] "LongestOrfLength" "OrfCovOverArrayLength" "LongestORFSeq"
#>> [16] "predicted_ins_count" "predicted_dels_count" "strain"
Some important points:
| Column name | Content |
|---|---|
AllDomains |
TRUE/FALSE, whether all the 3 domains of Tal arrays are detected |
aberrantRepeat |
TRUE/FALSE, whether there is any aberrant repeat |
predicted_ins_count |
number of insertions corrected by telltale2() |
predicted_dels_count |
number of deletions corrected by telltale2() |
ggplot(arrayRp) +
geom_bar(aes(x = strain, fill = AllDomains, color = aberrantRepeat), stat = "count") +
scale_fill_manual(values = c("grey", sunset10[4])) +
scale_color_manual(values = c("black", "red"), na.value = NA, labels = c("FALSE", "TRUE", "AnnoTALE_failed")) +
guides(color = guide_legend(override.aes = list(fill = sunset10[4])))
total correction count per genome (in this case, there is only 1 genome that has been corrected)
arrayRp_correction <- reshape2::melt(arrayRp[c("strain", "arrayID", "predicted_ins_count", "predicted_dels_count")], value.name = "count", variable.name = "type")
#>> Using strain, arrayID as id variables
arrayRp_correction %<>% dplyr::arrange(strain, arrayID)
ggplot(arrayRp_correction) +
facet_wrap(vars(type), nrow = 2, scales = "fixed") +
geom_bar(aes(x = strain, fill = factor(count))) +
scale_fill_viridis_d() +
guides(fill = guide_legend(title = "correction count"))
correction count per array per genome
ggplot(na.omit(arrayRp_correction), aes(x = arrayID, y = count, color = type, shape = type)) +
facet_wrap(vars(strain), nrow = 2) +
scale_shape_manual(values = c(20, 2)) +
geom_point(size = 4) +
scale_color_manual(values = c(sunset10[4], "black"))+
theme(axis.text.x = element_text(angle = 90),
legend.direction = "horizontal", legend.position = "bottom")
correction distribution per genome
ggplot(na.omit(arrayRp_correction), aes(x = strain, y = count)) +
geom_dotplot(aes(fill = type, color = type), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 1/10) +
scale_fill_manual(values = c(sunset10[4], "black")) +
scale_color_manual(values = c(sunset10[4], "black")) +
theme(legend.direction = "horizontal", legend.position = "bottom",
panel.grid.minor.y = element_blank())
Another report that we can reveal is “hitsReport.tsv”, which contains results of HMMer.
hitsReport_files <- dir(telltale2_outdir, "hitsReport.tsv", recursive = T, full.names = T)
hitsReport <- data.frame()
for (f in hitsReport_files) {
a <- read.table(f, sep = "\t", stringsAsFactors = F, header = T)
a$strain <- gsub("\\_.+", "", basename(dirname(f)))
hitsReport <- rbind(hitsReport, a)
}
hitsReport$query_name <- factor(hitsReport$query_name, levels = c("TALE_N-terminus_CDS_aligned_curated", "TALE_repeat_CDS_aligned_curated", "TALE_C-terminus_CDS_aligned_curated_long"), ordered = F)
information in the hitsReport
colnames(hitsReport)
#>> [1] "arrayID" "seqnames" "start" "end"
#>> [5] "width" "strand" "nhmmerHitID" "query_name"
#>> [9] "hitID" "seq" "codon_count" "frameshift_count"
#>> [13] "strain"
We could see the length distribution of each Tal protein domain
(query_name) predicted by HMMer.
ggplot(hitsReport) +
facet_wrap(vars(query_name), scales = "free", nrow = 1) +
geom_bar(aes(x = factor(codon_count), fill = strain), stat = "count", position = position_dodge(preserve = "single")) +
scale_fill_manual(values = sunset10) +
scale_y_sqrt(breaks = c(0, 1, 5, 10, 15, 100, 200, 350))
It seems that BAI3-1-1 has 1 Tal truncated at C-terminus caused by artifactual indels. But this is just the “raw” result of Tal motif detection.
Now we have a look at “domainsReport.tsv”, a similar report as “hitsReport.tsv” except that it contains the information of Tal domains predicted by AnnoTALE after correction, , so that we can make comparision and infer the impact of correction in the protein sequences.
domainsReport_file <- dir(telltale2_outdir, "domainsReport.tsv", recursive = T, full.names = T)
domainsReport <- data.frame()
for (f in domainsReport_file) {
a <- read.table(f, sep = "\t", stringsAsFactors = F, header = T)
a$strain <- gsub("\\_.+", "", basename(dirname(f)))
domainsReport <- rbind(domainsReport, a)
}
domainsReport$query_name <- factor(domainsReport$query_name, levels = c("N-terminus", "repeat", "C-terminus"))
domainsReport$codon_count <- as.factor(domainsReport$codon_count)
ggplot(domainsReport) +
facet_wrap(vars(query_name), scales = "free", nrow = 1) +
geom_bar(aes(x = codon_count, fill = strain), stat = "count", position = position_dodge(preserve = "single")) +
scale_fill_manual(values = sunset10) +
scale_y_sqrt(breaks = c(0, 1, 5, 10, 15, 100, 200, 350))
Firstly, the domain detection of AnnoTALE is different from HMMer at
some amino acids.
Secondly, the truncated C-terminus of BAI3-1-1 (84 aa) has been
corrected and has the same length as its counterparts.
To classify Tal groups, we use function runDistal() to
run the tool DisTal that outputs similarity between Tals and similarity
between repeats, then the Tals similarity can be taken as input of
groupTales() for clustering groups, and the repeats
similarity can be used for plotting multiple alignment of Tals within
group.
We run DisTal for all the putative TAL ORFs (from
telltale2 output).
distal_telltale <- file.path(outdir, "distal")
unlink(distal_telltale, recursive = TRUE)
dir.create(distal_telltale, showWarnings = F)
telltale2_cds <- file.path(distal_telltale, "cds.fasta")
unlink(telltale2_cds)
distal_telltale_input <- list.files(telltale2_outdir,
"putativeTalOrf.fasta",
recursive = T, full.names = T)
distal_telltale_input <- distal_telltale_input[!grepl("ROI", distal_telltale_input, fixed = T)]
for (input in distal_telltale_input) {
ttRunId <- gsub("\\_.+", "", basename(dirname(input)))
seqs <- readBStringSet(input, seek.first.rec = T)
names(seqs) <- paste0(ttRunId, "_", names(seqs))
seqs <- seqs[width(seqs) != 0]
#cat(names(seqs))
writeXStringSet(seqs, telltale2_cds, append = T)
}
# run DisTal
distal_output <- runDistal(telltale2_cds, outdir = distal_telltale, overwrite = T)
# distal_output$tal.similarity %>%as_tibble() %>% filter(TAL1 == | TAL2 == )
distalrOutDir <- file.path(outdir, "distalr")
unlink(distalrOutDir, recursive = TRUE)
dir.create(distalrOutDir, showWarnings = F)
taleParts <- lapply(list.dirs(telltale2_outdir, recursive = FALSE), function(ttGenomeDir) {
genomeId <- gsub("_.*correction", "", basename(ttGenomeDir))
logger::log_debug("Current genome: {genomeId}")
taleParts <- getTaleParts(ttGenomeDir) %>%
mutate(arrayID = paste0(genomeId, "_", arrayID))
return(taleParts)
}) %>%
bind_rows()
partsForPlots <- taleParts %>%
mutate(label = if_else(domainType == "repeat", rvd, ""),
aaSeqLength = factor(nchar(aaSeq))
)
partsForPlots %>% ggplot(mapping = aes(fill = aaSeqLength,
color = domainType,
label = label,
y = arrayID,
x = positionInArray),
color = isNaAaSeq) +
scale_color_viridis_d(option = "rocket") +
scale_fill_discrete() +
scale_x_continuous(breaks = 1:50, minor_breaks = NULL) +
geom_point(shape = 21, size = 5, stroke = 0.9) +
ggnewscale::new_scale_color() +
ggnewscale::new_scale_fill() +
geom_text(size = 2.1, color = "white") +
facet_grid(seqnames~ ., scales = "free_y", space = "free") +
theme_light()
partsForDistalr <- taleParts
# distalr_mms_output <- distalr(taleParts = partsForDistalr,
# repeats.cluster.h.cut = 10, ncores = 6,
# pairwiseAlnMethod = "mmseq2",
# condaBinPath = "/home/cunnac/bin/miniconda3/condabin/conda")
distalr_deci_output <- distalr(taleParts = partsForDistalr,
repeats.cluster.h.cut = 10, ncores = 6,
pairwiseAlnMethod = "DECIPHER")
distalr_bs_output <- distalr(taleParts = partsForDistalr,
repeats.cluster.h.cut = 10, ncores = 6,
pairwiseAlnMethod = "Biostrings")
library('corrr')
library(ggcorrplot)
talSimOutAgregated <- list(distalALV = distal_output$tal.similarity,
distalDECI = distalr_deci_output$tal.similarity,
distalBIOS = distalr_bs_output$tal.similarity
) %>% bind_rows(.id = "method")
talSimOutAgregatedLong <- reshape2::dcast(talSimOutAgregated %>% select(method, TAL1, TAL2, Sim),
formula = method ~ TAL1 + TAL2, value.var = "Sim"
)
rownames(talSimOutAgregatedLong) <- talSimOutAgregatedLong$method
talSimOutAgregatedLong$method <- NULL
talSimOutAgregatedLong %<>% as.matrix() %>% t()
rownames(talSimOutAgregatedLong) <- NULL
corr_matrix <- cor(talSimOutAgregatedLong)
knitr::kable(corr_matrix)
| distalALV | distalBIOS | distalDECI | |
|---|---|---|---|
| distalALV | 1.0000000 | 0.9533716 | 0.9547991 |
| distalBIOS | 0.9533716 | 1.0000000 | 0.9961920 |
| distalDECI | 0.9547991 | 0.9961920 | 1.0000000 |
ggcorrplot(corr_matrix)
talSimOutAgregated <- reshape2::dcast(talSimOutAgregated %>% select(method, TAL1, TAL2, Sim),
formula = TAL1 + TAL2 ~ method, value.var = "Sim"
) %>% as_tibble()
talSimOutAgregated %>%
select(-distalBIOS) %>%
mutate(pointLab = paste(TAL1, TAL2, sep = "|")) %>%
ggplot(mapping = aes(x = distalALV, distalDECI)) +
geom_point() +
geom_abline(intercept = 0, slope = 1)
k <- 26
groupsALV <- invisible(groupTales(taleSim = distal_output$tal.similarity,
plotTree = TRUE, k = k, k_test = 10:40,
method = "k-medoids"))
#>> tale tree will not be plotted!
#>> Number of groups is decided based on the provided value of k: 26
groupsDECI <- invisible(groupTales(taleSim = distalr_deci_output$tal.similarity,
plotTree = TRUE, k = k, k_test = 10:40,
method = "k-medoids"))
#>> tale tree will not be plotted!
#>> Number of groups is decided based on the provided value of k: 26
groupsBIOS <- invisible(groupTales(taleSim = distalr_bs_output$tal.similarity,
plotTree = TRUE, k = k, k_test = 10:40,
method = "k-medoids"))
#>> tale tree will not be plotted!
#>> Number of groups is decided based on the provided value of k: 26
groupTales(taleSim = distal_output$tal.similarity,
plotTree = TRUE, k = k,
method = "hclust")
#>> name group
#>> 1 BAI3_ROI_00001 1
#>> 2 BAI3_ROI_00002 2
#>> 3 BAI3_ROI_00003 3
#>> 4 BAI3_ROI_00004 4
#>> 5 BAI3_ROI_00006 5
#>> 6 BAI3_ROI_00007 6
#>> 7 BAI3_ROI_00008 7
#>> 8 BAI3_ROI_00009 8
#>> 9 BAI3_ROI_00010 9
#>> 10 BAI3-1-1_ROI_00001 2
#>> 11 BAI3-1-1_ROI_00002 3
#>> 12 BAI3-1-1_ROI_00003 4
#>> 13 BAI3-1-1_ROI_00005 5
#>> 14 BAI3-1-1_ROI_00006 6
#>> 15 BAI3-1-1_ROI_00007 7
#>> 16 BAI3-1-1_ROI_00008 8
#>> 17 BAI3-1-1_ROI_00009 9
#>> 18 MAI1_ROI_00001 1
#>> 19 MAI1_ROI_00002 2
#>> 20 MAI1_ROI_00003 10
#>> 21 MAI1_ROI_00004 4
#>> 22 MAI1_ROI_00006 5
#>> 23 MAI1_ROI_00007 6
#>> 24 MAI1_ROI_00008 7
#>> 25 MAI1_ROI_00009 8
#>> 26 MAI1_ROI_00010 9
#>> 27 PXO86_ROI_00001 11
#>> 28 PXO86_ROI_00002 12
#>> 29 PXO86_ROI_00003 13
#>> 30 PXO86_ROI_00004 14
#>> 31 PXO86_ROI_00006 15
#>> 32 PXO86_ROI_00007 16
#>> 33 PXO86_ROI_00008 17
#>> 34 PXO86_ROI_00009 18
#>> 35 PXO86_ROI_00010 19
#>> 36 PXO86_ROI_00011 20
#>> 37 PXO86_ROI_00012 21
#>> 38 PXO86_ROI_00013 22
#>> 39 PXO86_ROI_00014 23
#>> 40 PXO86_ROI_00015 23
#>> 41 PXO86_ROI_00016 24
#>> 42 PXO86_ROI_00017 25
#>> 43 PXO86_ROI_00018 26
#>> 44 PXO86_ROI_00019 11
groupTales(taleSim = distalr_deci_output$tal.similarity,
plotTree = TRUE, k = k,
method = "hclust")
#>> name group
#>> 1 BAI3_ROI_00001 1
#>> 2 BAI3_ROI_00002 2
#>> 3 BAI3_ROI_00003 3
#>> 4 BAI3_ROI_00004 4
#>> 5 BAI3_ROI_00006 5
#>> 6 BAI3_ROI_00007 6
#>> 7 BAI3_ROI_00008 7
#>> 8 BAI3_ROI_00009 8
#>> 9 BAI3_ROI_00010 9
#>> 10 BAI3-1-1_ROI_00001 2
#>> 11 BAI3-1-1_ROI_00002 3
#>> 12 BAI3-1-1_ROI_00003 4
#>> 13 BAI3-1-1_ROI_00005 5
#>> 14 BAI3-1-1_ROI_00006 6
#>> 15 BAI3-1-1_ROI_00007 7
#>> 16 BAI3-1-1_ROI_00008 8
#>> 17 BAI3-1-1_ROI_00009 9
#>> 18 MAI1_ROI_00001 1
#>> 19 MAI1_ROI_00002 2
#>> 20 MAI1_ROI_00003 3
#>> 21 MAI1_ROI_00004 4
#>> 22 MAI1_ROI_00006 5
#>> 23 MAI1_ROI_00007 10
#>> 24 MAI1_ROI_00008 7
#>> 25 MAI1_ROI_00009 8
#>> 26 MAI1_ROI_00010 9
#>> 27 PXO86_ROI_00001 11
#>> 28 PXO86_ROI_00002 12
#>> 29 PXO86_ROI_00003 13
#>> 30 PXO86_ROI_00004 14
#>> 31 PXO86_ROI_00006 15
#>> 32 PXO86_ROI_00007 16
#>> 33 PXO86_ROI_00008 17
#>> 34 PXO86_ROI_00009 18
#>> 35 PXO86_ROI_00010 19
#>> 36 PXO86_ROI_00011 20
#>> 37 PXO86_ROI_00012 21
#>> 38 PXO86_ROI_00013 22
#>> 39 PXO86_ROI_00014 23
#>> 40 PXO86_ROI_00015 23
#>> 41 PXO86_ROI_00016 24
#>> 42 PXO86_ROI_00017 25
#>> 43 PXO86_ROI_00018 26
#>> 44 PXO86_ROI_00019 11
groupTales(taleSim = distalr_bs_output$tal.similarity,
plotTree = TRUE, k = k,
method = "hclust")
#>> name group
#>> 1 BAI3_ROI_00001 1
#>> 2 BAI3_ROI_00002 2
#>> 3 BAI3_ROI_00003 3
#>> 4 BAI3_ROI_00004 4
#>> 5 BAI3_ROI_00006 5
#>> 6 BAI3_ROI_00007 6
#>> 7 BAI3_ROI_00008 7
#>> 8 BAI3_ROI_00009 8
#>> 9 BAI3_ROI_00010 9
#>> 10 BAI3-1-1_ROI_00001 2
#>> 11 BAI3-1-1_ROI_00002 3
#>> 12 BAI3-1-1_ROI_00003 4
#>> 13 BAI3-1-1_ROI_00005 5
#>> 14 BAI3-1-1_ROI_00006 6
#>> 15 BAI3-1-1_ROI_00007 7
#>> 16 BAI3-1-1_ROI_00008 8
#>> 17 BAI3-1-1_ROI_00009 9
#>> 18 MAI1_ROI_00001 1
#>> 19 MAI1_ROI_00002 2
#>> 20 MAI1_ROI_00003 3
#>> 21 MAI1_ROI_00004 4
#>> 22 MAI1_ROI_00006 5
#>> 23 MAI1_ROI_00007 10
#>> 24 MAI1_ROI_00008 7
#>> 25 MAI1_ROI_00009 8
#>> 26 MAI1_ROI_00010 9
#>> 27 PXO86_ROI_00001 11
#>> 28 PXO86_ROI_00002 12
#>> 29 PXO86_ROI_00003 13
#>> 30 PXO86_ROI_00004 14
#>> 31 PXO86_ROI_00006 15
#>> 32 PXO86_ROI_00007 16
#>> 33 PXO86_ROI_00008 17
#>> 34 PXO86_ROI_00009 18
#>> 35 PXO86_ROI_00010 19
#>> 36 PXO86_ROI_00011 20
#>> 37 PXO86_ROI_00012 21
#>> 38 PXO86_ROI_00013 22
#>> 39 PXO86_ROI_00014 23
#>> 40 PXO86_ROI_00015 23
#>> 41 PXO86_ROI_00016 24
#>> 42 PXO86_ROI_00017 25
#>> 43 PXO86_ROI_00018 26
#>> 44 PXO86_ROI_00019 11
getTALEsTree <- function(taleSim) {
distMat <- 100 - reshape2::acast(taleSim, formula = TAL1 ~ TAL2, value.var = "Sim")
hclust(dist(distMat, method = "euclidean"), method = "ward.D")
}
getTALEsTree(distal_output$tal.similarity) %>%
ggtree::ggtree() +
ggtree::layout_dendrogram() +
ggtree::geom_tiplab(ggtree::aes(label=label),
angle= 90, hjust=0,
offset = -3,
, color='black')
getTALEsTree(distalr_deci_output$tal.similarity) %>%
ggtree::ggtree() +
ggtree::layout_dendrogram() +
ggtree::geom_tiplab(ggtree::aes(label=label),
angle= 90, hjust=0,
offset = -3,
, color='black')
getTALEsTree(distalr_bs_output$tal.similarity) %>%
ggtree::ggtree() +
ggtree::layout_dendrogram() +
ggtree::geom_tiplab(ggtree::aes(label=label),
angle= 90, hjust=0,
offset = -3,
, color='black')
ape::cophyloplot(getTALEsTree(distal_output$tal.similarity) %>% as.phylo(),
getTALEsTree(distalr_deci_output$tal.similarity) %>% as.phylo(),
assoc = matrix(rep(distal_output$tal.similarity$TAL1 %>% unique() %>% as.character(), 2),
ncol = 2, byrow = FALSE),
length.line = 0, space=200, gap=10,
col = "red")
taleGroups <- groupsBIOS
Based on the silhouette values, groupTales() will take
the value at the elbow of the curve. In this case it picked k = 9, and
it makes sense as we already know that the each of the gold genomes have
9 Tals.
We can make MSA of repeatID sequence or RVD sequence with the
function buildRepeatMsa(), however it make more sense to
align repeat than RVD. For mafft, the combo of gap opening score = 0 and
gap extending score = 5 creates better alignment than default
values.
repeatVectors <- distalr_deci_output$coded.repeats.str
repeatSim <- distalr_deci_output$repeat.similarity %>% select(-Dissim) #distal_output$repeat.similarity
repeatMsaByGroup_withSim <- lapply(sort(unique(taleGroups$group)), function(G) {
repeatVectorsForGroup <- repeatVectors[taleGroups$name[taleGroups$group == G]]
if (length(repeatVectorsForGroup) < 2) {
msa <- as.matrix(as.data.frame(repeatVectorsForGroup))
msa <- matrix(msa, nrow = 1)
rownames(msa) <- colnames(as.data.frame(repeatVectorsForGroup))
colnames(msa) <- 1:length(msa)
return(msa)
next()
}
repeatMsa <- buildRepeatMsa(inputSeqs = repeatVectorsForGroup, sep = " ",
distalRepeatSims = repeatSim,
mafftOpts = "--globalpair --weighti 1 --maxiterate 1000 --reorder --op 0 --ep 5 --thread 1")
}
)
For this type of plot, we need to map repeatID to RVD with
getRepeat2RvdMapping(), then convert repeat MSA to RVD MSA
by convertRepeat2RvdAlign. We should use the converting
function rather than using buildRepeatMsa() to creat MSA of
RVD because there will be disagreement between them.
rvdSeqs_fasta <- list.files(telltale2_outdir, "rvdSequences.fas", recursive = T, full.names = T)
telltale2_rvd <- file.path(distal_telltale, "rvd.fasta")
unlink(telltale2_rvd)
# rename of RVD sequences to distinguish sequences between different strains
for (input in rvdSeqs_fasta) {
id <- gsub("\\_.+", "", basename(dirname(input)))
seqs <- readBStringSet(input, seek.first.rec = T)
names(seqs) <- paste0(id, "_", names(seqs))
names(seqs) <- stringr::str_trim(names(seqs))
seqs <- seqs[width(seqs) != 0]
writeXStringSet(seqs, telltale2_rvd, append = T)
}
rvdVectors <- toListOfSplitedStr(telltale2_rvd, sep = "-")
repeatVectors <- toListOfSplitedStr(distalr_deci_output$coded.repeats.str, sep = " ")
# map repeatID to RVD
rep2rvd <- getRepeat2RvdMapping(talesRepeatVectors = repeatVectors, talesRvdVectors = rvdVectors)
datatable(rep2rvd, options = list(pageLength = 5))
# convert repeat MSA to RVD MSA
repeatMSA <- repeatMsaByGroup_withSim[[2]]
rvdMSA <- convertRepeat2RvdAlign(repeatAlign = repeatMSA, repeat2RvdMapping = rep2rvd)
# plot MSA
tantale::heatmap_msa(talsim = distalr_deci_output$tal.similarity,
repeatAlign = repeatMSA,
repeatSim = distalr_deci_output$repeat.similarity,
rvdAlign = rvdMSA,
plot.type = "repeat.similarity",
main = "Repeat MSA with similarity - Group 3")
tantale::heatmap_msa(talsim = distalr_deci_output$tal.similarity,
repeatAlign = repeatMSA,
repeatSim = distalr_deci_output$repeat.similarity,
plot.type = "repeat.clusters",
main = "Repeat Cluster MSA - Group 3")
For this plot, we also use the RVD MSA converted from repeat MSA.
rvdMSA <- convertRepeat2RvdAlign(repeatAlign = repeatMSA, repeat2RvdMapping = rep2rvd)
tantale::heatmap_msa(talsim = distalr_deci_output$tal.similarity,
repeatAlign = repeatMSA,
repeatSim = distalr_deci_output$repeat.similarity,
rvdAlign = rvdMSA,
plot.type = "repeat.clusters.with.rvd",
main = "Repeat Cluster MSA with RVD - Group 3",
consensusSeq = TRUE)
With the Tal classification result and RVD sequences, we can represent RVD sequence variants of all the strains for each Tal group as an example below.
tale_annotation <- merge(taleGroups,
data.frame("name" = names(rvdVectors),
"rvdseq" = sapply(rvdVectors,
function(i) paste(i, collapse = "-")
), stringsAsFactors = FALSE
)
)
tale_annotation$strain <- gsub("\\_.+", "", tale_annotation$name)
heatmap_talomes(tale_annotation,
col = "group", row = "strain",
value = "rvdseq",
mapcol = sunset10)
We wrap Talvez and Preditale tools in 2 functions,
talvez() and preditale(), that take the same
types of RVD sequences and promoter DNA sequences as inputs and output a
data frame containing predictions.
We need to concatenate all RVD sequences from 3
telltale2 outputs and remove NTERM and CTERM markers.
## rvd sequences from telltale2 output
rvdSeqs_tt <- readBStringSet(telltale2_rvd)
rvdSeqs_tt <- gsub("\\-{0,1}\\w{5}\\-{0,1}", "",rvdSeqs_tt) %>% BStringSet() # remove NTERM and CTERM
predict_input <- file.path(outdir, "rvd_to_predict.fasta")
writeXStringSet(rvdSeqs_tt, predict_input)
## promoter sequences
rvdSeqsXstrings <- predict_input
subjDnaSeqFile <- system.file("extdata", "cladeIII_sweet_promoters.fasta", package = "tantale", mustWork = T)
readBStringSet(subjDnaSeqFile) %>% names
#>> [1] "SWEET11p_IR64_Sense" "SWEET11p_BT07_Sense"
#>> [3] "SWEET11p_93-11_Sense" "SWEET11p_Nipponbare_Sense"
#>> [5] "SWEET13p_BT7_Sense" "SWEET13p_IR24_Sense"
#>> [7] "SWEET13p_Nipponbare_Sense" "SWEET14p_BT07_Sense"
#>> [9] "SWEET14p_Nipponbare_Sense"
The output of talvez() contains the position, score and
rank of each prediction.
talvezPreds <- talvez(rvdSeqs = rvdSeqsXstrings,
subjDnaSeqFile = subjDnaSeqFile,
optParam = "-t 0 -l 19",
condaBinPath = "/home/cunnac/bin/miniconda3/condabin/conda")
datatable(talvezPreds, options = list(pageLength = 1))
Here is an example of summarizing prediction result with
heatmap.2() (scores are display by color scale, black
refers no prediction):
talvezPreds$strain <- gsub("\\_.+", "", talvezPreds$taleId)
talvezPreds$group <- sapply(talvezPreds$taleId, function(id) {
ifelse(id %in% taleGroups$name, taleGroups$group[taleGroups$name == id], NA)
}, USE.NAMES = F)
slctPromoters <- c("SWEET14p_Nipponbare_Sense", "SWEET13p_Nipponbare_Sense", "SWEET11p_Nipponbare_Sense")
selectedPreds <- talvezPreds %>%
dplyr::filter(subjSeqId %in% slctPromoters)
selectedPredsMat <- selectedPreds %>%
reshape2::acast(taleId ~ subjSeqId, max, na.rm = TRUE, value.var = "score", fill = as.single(NA))
heatmap.2(selectedPredsMat,
col = viridis::viridis(20), #cm.colors(255),
sepwidth=c(0.02,0.02), sepcolor="white", colsep = 1:ncol(selectedPredsMat), rowsep = 1:nrow(selectedPredsMat),
trace="none",
margins = c(10, 15),
cexRow = 1,
cexCol = 1, srtCol = 40,
density.info= "histogram", key.xlab = "Pred. Score", key.title = NA,
lwid = c(1,5), lhei = c(1,5),
dendrogram = "none", Rowv = NULL, Colv = NULL,
na.color = "black",
main = "Talvez prediction"
)
To see how RVD sequences match a promoter region, we supply the
prediction table and the promoter sequence with the genomic range to
plotTaleTargetPred():
grFilter <- "SWEET14p_Nipponbare_Sense:340-450"
plotTaleTargetPred(predResults = selectedPreds, subjDnaSeqFile = subjDnaSeqFile, filterRange = grFilter)
#>> Registered S3 methods overwritten by 'ggalt':
#>> method from
#>> grid.draw.absoluteGrob ggplot2
#>> grobHeight.absoluteGrob ggplot2
#>> grobWidth.absoluteGrob ggplot2
#>> grobX.absoluteGrob ggplot2
#>> grobY.absoluteGrob ggplot2
With Preditale, we get similar data frame output but with p value additionally. Preditale takes a little bit longer than Talvez and may give some different predictions.
preditalePreds <- preditale(rvdSeqs = rvdSeqsXstrings, subjDnaSeqFile = subjDnaSeqFile, outDir = NULL)
datatable(preditalePreds, options = list(pageLength = 1))
talvezPreds$strain <- gsub("\\_.+", "", talvezPreds$taleId)
talvezPreds$group <- sapply(talvezPreds$taleId, function(id) {
ifelse(id %in% taleGroups$name, taleGroups$group[taleGroups$name == id], NA)
}, USE.NAMES = F)
slctPromoters <- c("SWEET14p_Nipponbare_Sense", "SWEET13p_Nipponbare_Sense", "SWEET11p_Nipponbare_Sense")
selectedPreds <- preditalePreds %>%
dplyr::filter(subjSeqId %in% slctPromoters)
selectedPredsMat <- selectedPreds %>%
reshape2::acast(taleId ~ subjSeqId, max, na.rm = TRUE, value.var = "score", fill = as.single(NA))
gplots::heatmap.2(selectedPredsMat,
col = viridis::viridis(20), #cm.colors(255),
sepwidth=c(0.02,0.02), sepcolor="white", colsep = 1:ncol(selectedPredsMat), rowsep = 1:nrow(selectedPredsMat),
trace="none",
margins = c(10, 15),
cexRow = 1,
cexCol = 1, srtCol = 40,
density.info= "histogram", key.xlab = "Pred. Score", key.title = NA,
lwid = c(1,5), lhei = c(1,5),
dendrogram = "none", Rowv = NULL, Colv = NULL,
na.color = "black",
main = "Preditale prediction"
)
grFilter <- "SWEET14p_Nipponbare_Sense:340-450"
plotTaleTargetPred(predResults = selectedPreds, subjDnaSeqFile = subjDnaSeqFile, filterRange = grFilter)